Imputation

# Evaluate imputation.R in the environment knitr uses for chunk code, so the
# objects it defines are visible to the chunks below.
# NOTE(review): data, miss_data, impt_*_data, plot_na_pie() and
# create_compare_data() are presumably defined there - confirm.
source("imputation.R", local = knitr::knit_global())
## Warning: package 'mice' was built under R version 4.1.3
##  reg  age  sex  hgt  wgt 
## 1021 1011  973  978  998

NA statistics in the missing dataset

# Call plot_na_pie() once per column and keep the returned value; wgt_nas and
# hgt_nas are passed as `nas =` to create_compare_data() further down.
# NOTE(review): plot_na_pie() presumably draws the NA share of the column and
# prints the NA count shown below - confirm in imputation.R.
wgt_nas <- plot_na_pie("wgt")

## [1] 998

hgt_nas <- plot_na_pie("hgt")

## [1] 978

# age_nas is not referenced again in the visible part of this document.
age_nas <- plot_na_pie("age")

## [1] 1011

MICE: Weight

MICE: compare the imputed datasets with the original dataset

# Comparison table for wgt, built by create_compare_data() from data,
# miss_data and impt_mice_data with sp_impt = "method".
df_mice_wgt <- create_compare_data(
  data, miss_data, impt_mice_data,
  nas = wgt_nas, col = "wgt", method = "mice", sp_impt = "method"
)
# wgt against age: scatter plus smoother, coloured by source.
ggplot(data = df_mice_wgt, mapping = aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of wgt for each source.
ggplot(data = df_mice_wgt, mapping = aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

# Same axes, but the boxplot layer maps colour to sex.
ggplot(data = df_mice_wgt, mapping = aes(x = source, y = wgt, colour = source)) +
  geom_boxplot(mapping = aes(colour = sex))

MICE:compare split with Sex

# Rebuild the wgt comparison table for MICE, this time with sp_impt = "sex".
df_mice_wgt <- create_compare_data(
  data, miss_data, impt_mice_data,
  nas = wgt_nas, col = "wgt", method = "mice", sp_impt = "sex"
)
# wgt against age: scatter plus smoother, coloured by source.
ggplot(data = df_mice_wgt, mapping = aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of wgt for each source.
ggplot(data = df_mice_wgt, mapping = aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

MICE:compare by NA counts

# wgt against age, coloured by na_count.
ggplot(data = df_mice_wgt, mapping = aes(x = age, y = wgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of wgt per na_count value, coloured by sex.
ggplot(data = df_mice_wgt, mapping = aes(x = na_count, y = wgt, colour = sex)) +
  geom_boxplot()

# Same scatter, restricted to rows whose na_count matches "4:|True".
ggplot(
  data = df_mice_wgt[grepl("4:|True", df_mice_wgt[["na_count"]]), ],
  mapping = aes(x = age, y = wgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

MICE:compare split with age 14

# Rows with age >= 14
ggplot(
  data = df_mice_wgt[df_mice_wgt[["age"]] >= 14, ],
  mapping = aes(x = age, y = wgt, colour = source)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Rows with age < 14
ggplot(
  data = df_mice_wgt[df_mice_wgt[["age"]] < 14, ],
  mapping = aes(x = age, y = wgt, colour = source)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

Ranger: Weight

Ranger: compare the imputed datasets with the original dataset

# Comparison table for wgt, built by create_compare_data() from data,
# miss_data and impt_ranger_data with sp_impt = "method".
df_ranger_wgt <- create_compare_data(
  data, miss_data, impt_ranger_data,
  nas = wgt_nas, col = "wgt", method = "ranger", sp_impt = "method"
)
# wgt against age: scatter plus smoother, coloured by source.
ggplot(data = df_ranger_wgt, mapping = aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of wgt for each source.
ggplot(data = df_ranger_wgt, mapping = aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

# Same axes, but the boxplot layer maps colour to sex.
ggplot(data = df_ranger_wgt, mapping = aes(x = source, y = wgt, colour = source)) +
  geom_boxplot(mapping = aes(colour = sex))

Ranger:compare split with Sex

# Rebuild the wgt comparison table for ranger, this time with sp_impt = "sex".
df_ranger_wgt <- create_compare_data(
  data, miss_data, impt_ranger_data,
  nas = wgt_nas, col = "wgt", method = "ranger", sp_impt = "sex"
)
# wgt against age: scatter plus smoother, coloured by source.
ggplot(data = df_ranger_wgt, mapping = aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of wgt for each source.
ggplot(data = df_ranger_wgt, mapping = aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

Ranger:compare by NA counts

# wgt against age, coloured by na_count.
ggplot(data = df_ranger_wgt, mapping = aes(x = age, y = wgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Boxplots of wgt per na_count value, coloured by sex.
ggplot(data = df_ranger_wgt, mapping = aes(x = na_count, y = wgt, colour = sex)) +
  geom_boxplot()

# Same scatter, restricted to rows whose na_count matches "4:|True".
ggplot(
  data = df_ranger_wgt[grepl("4:|True", df_ranger_wgt[["na_count"]]), ],
  mapping = aes(x = age, y = wgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Ranger:compare split with age 14

# Rows with age >= 14
ggplot(
  data = df_ranger_wgt[df_ranger_wgt[["age"]] >= 14, ],
  mapping = aes(x = age, y = wgt, colour = source)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Rows with age < 14
ggplot(
  data = df_ranger_wgt[df_ranger_wgt[["age"]] < 14, ],
  mapping = aes(x = age, y = wgt, colour = source)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

MIDAS: Weight

MIDAS: compare the imputed datasets with the original dataset

# Comparison table for wgt, built by create_compare_data() from data,
# miss_data and impt_rmidas_data with sp_impt = "method".
df_midas_wgt <- create_compare_data(
  data, miss_data, impt_rmidas_data,
  nas = wgt_nas, col = "wgt", method = "midas", sp_impt = "method"
)
# wgt against age: scatter plus smoother, coloured by source.
ggplot(data = df_midas_wgt, mapping = aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of wgt for each source.
ggplot(data = df_midas_wgt, mapping = aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

# Same axes, but the boxplot layer maps colour to sex.
ggplot(data = df_midas_wgt, mapping = aes(x = source, y = wgt, colour = source)) +
  geom_boxplot(mapping = aes(colour = sex))

MIDAS:compare split with Sex

# Rebuild the wgt comparison table for MIDAS, this time with sp_impt = "sex".
df_midas_wgt <- create_compare_data(
  data, miss_data, impt_rmidas_data,
  nas = wgt_nas, col = "wgt", method = "midas", sp_impt = "sex"
)
# wgt against age: scatter plus smoother, coloured by source.
ggplot(data = df_midas_wgt, mapping = aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of wgt for each source.
ggplot(data = df_midas_wgt, mapping = aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

MIDAS:compare by NA counts

# wgt against age, coloured by na_count.
ggplot(data = df_midas_wgt, mapping = aes(x = age, y = wgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of wgt per na_count value, coloured by sex.
ggplot(data = df_midas_wgt, mapping = aes(x = na_count, y = wgt, colour = sex)) +
  geom_boxplot()

# Same scatter, restricted to rows whose na_count matches "4:|True".
ggplot(
  data = df_midas_wgt[grepl("4:|True", df_midas_wgt[["na_count"]]), ],
  mapping = aes(x = age, y = wgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

MIDAS:compare split with age 14

# Rows with age >= 14
ggplot(
  data = df_midas_wgt[df_midas_wgt[["age"]] >= 14, ],
  mapping = aes(x = age, y = wgt, colour = source)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Rows with age < 14
ggplot(
  data = df_midas_wgt[df_midas_wgt[["age"]] < 14, ],
  mapping = aes(x = age, y = wgt, colour = source)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

MICE: Height

MICE: compare the imputed datasets with the original dataset

# Comparison table for hgt, built by create_compare_data() from data,
# miss_data and impt_mice_data with sp_impt = "method".
df_mice_hgt <- create_compare_data(
  data, miss_data, impt_mice_data,
  nas = hgt_nas, col = "hgt", method = "mice", sp_impt = "method"
)
# hgt against age: scatter plus smoother, coloured by source.
ggplot(data = df_mice_hgt, mapping = aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of hgt for each source.
ggplot(data = df_mice_hgt, mapping = aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

# Same axes, but the boxplot layer maps colour to sex.
ggplot(data = df_mice_hgt, mapping = aes(x = source, y = hgt, colour = source)) +
  geom_boxplot(mapping = aes(colour = sex))

MICE:compare split with Sex

# Rebuild the hgt comparison table for MICE, this time with sp_impt = "sex".
df_mice_hgt <- create_compare_data(
  data, miss_data, impt_mice_data,
  nas = hgt_nas, col = "hgt", method = "mice", sp_impt = "sex"
)
# hgt against age: scatter plus smoother, coloured by source.
ggplot(data = df_mice_hgt, mapping = aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of hgt for each source.
ggplot(data = df_mice_hgt, mapping = aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

MICE:compare by NA counts

# hgt against age, coloured by na_count.
ggplot(data = df_mice_hgt, mapping = aes(x = age, y = hgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of hgt per na_count value, coloured by sex so that this plot is
# comparable with the NA-count boxplots of the other method/variable sections
# (all of which map colour to sex rather than source).
ggplot(data = df_mice_hgt, mapping = aes(x = na_count, y = hgt, colour = sex)) +
  geom_boxplot()

# Same scatter, restricted to rows whose na_count matches "4:|True".
ggplot(
  data = df_mice_hgt[grepl("4:|True", df_mice_hgt[["na_count"]]), ],
  mapping = aes(x = age, y = hgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

MICE:compare split with age 14

# Rows with age >= 14
ggplot(
  data = df_mice_hgt[df_mice_hgt[["age"]] >= 14, ],
  mapping = aes(x = age, y = hgt, colour = source)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Rows with age < 14
ggplot(
  data = df_mice_hgt[df_mice_hgt[["age"]] < 14, ],
  mapping = aes(x = age, y = hgt, colour = source)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

Ranger: Height

Ranger: compare the imputed datasets with the original dataset

# Comparison table for hgt, built by create_compare_data() from data,
# miss_data and impt_ranger_data with sp_impt = "method".
df_ranger_hgt <- create_compare_data(
  data, miss_data, impt_ranger_data,
  nas = hgt_nas, col = "hgt", method = "ranger", sp_impt = "method"
)
# hgt against age: scatter plus smoother, coloured by source.
ggplot(data = df_ranger_hgt, mapping = aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of hgt for each source.
ggplot(data = df_ranger_hgt, mapping = aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

# Same axes, but the boxplot layer maps colour to sex.
ggplot(data = df_ranger_hgt, mapping = aes(x = source, y = hgt, colour = source)) +
  geom_boxplot(mapping = aes(colour = sex))

Ranger:compare split with Sex

# Rebuild the hgt comparison table for ranger, this time with sp_impt = "sex".
df_ranger_hgt <- create_compare_data(
  data, miss_data, impt_ranger_data,
  nas = hgt_nas, col = "hgt", method = "ranger", sp_impt = "sex"
)
# hgt against age: scatter plus smoother, coloured by source.
ggplot(data = df_ranger_hgt, mapping = aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of hgt for each source.
ggplot(data = df_ranger_hgt, mapping = aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

Ranger:compare by NA counts

# hgt against age, coloured by na_count.
ggplot(data = df_ranger_hgt, mapping = aes(x = age, y = hgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of hgt per na_count value, coloured by sex.
ggplot(data = df_ranger_hgt, mapping = aes(x = na_count, y = hgt, colour = sex)) +
  geom_boxplot()

# Same scatter, restricted to rows whose na_count matches "4:|True".
ggplot(
  data = df_ranger_hgt[grepl("4:|True", df_ranger_hgt[["na_count"]]), ],
  mapping = aes(x = age, y = hgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

Ranger:compare split with age 14

# Rows with age >= 14
ggplot(
  data = df_ranger_hgt[df_ranger_hgt[["age"]] >= 14, ],
  mapping = aes(x = age, y = hgt, colour = source)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Rows with age < 14
ggplot(
  data = df_ranger_hgt[df_ranger_hgt[["age"]] < 14, ],
  mapping = aes(x = age, y = hgt, colour = source)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

MIDAS: Height

MIDAS: compare the imputed datasets with the original dataset

# Comparison table for hgt, built by create_compare_data() from data,
# miss_data and impt_rmidas_data with sp_impt = "method".
df_midas_hgt <- create_compare_data(
  data, miss_data, impt_rmidas_data,
  nas = hgt_nas, col = "hgt", method = "midas", sp_impt = "method"
)
# hgt against age: scatter plus smoother, coloured by source.
ggplot(data = df_midas_hgt, mapping = aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of hgt for each source.
ggplot(data = df_midas_hgt, mapping = aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

# Same axes, but the boxplot layer maps colour to sex.
ggplot(data = df_midas_hgt, mapping = aes(x = source, y = hgt, colour = source)) +
  geom_boxplot(mapping = aes(colour = sex))

MIDAS:compare split with Sex

# Rebuild the hgt comparison table for MIDAS, this time with sp_impt = "sex".
df_midas_hgt <- create_compare_data(
  data, miss_data, impt_rmidas_data,
  nas = hgt_nas, col = "hgt", method = "midas", sp_impt = "sex"
)
# hgt against age: scatter plus smoother, coloured by source.
ggplot(data = df_midas_hgt, mapping = aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of hgt for each source.
ggplot(data = df_midas_hgt, mapping = aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

MIDAS:compare by NA counts

# hgt against age, coloured by na_count.
ggplot(data = df_midas_hgt, mapping = aes(x = age, y = hgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Boxplots of hgt per na_count value, coloured by sex.
ggplot(data = df_midas_hgt, mapping = aes(x = na_count, y = hgt, colour = sex)) +
  geom_boxplot()

# Same scatter, restricted to rows whose na_count matches "4:|True".
ggplot(
  data = df_midas_hgt[grepl("4:|True", df_midas_hgt[["na_count"]]), ],
  mapping = aes(x = age, y = hgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

MIDAS:compare split with age 14

# Rows with age >= 14
ggplot(
  data = df_midas_hgt[df_midas_hgt[["age"]] >= 14, ],
  mapping = aes(x = age, y = hgt, colour = source)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Rows with age < 14
ggplot(
  data = df_midas_hgt[df_midas_hgt[["age"]] < 14, ],
  mapping = aes(x = age, y = hgt, colour = source)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

compare miss to true data:wgt

# Rows whose wgt is NA in miss_data.
miss_index <- which(is.na(miss_data$wgt))
# Loop-invariant inputs, computed once: sex and the wgt recorded in `data` for
# those rows.
sex <- factor(data$sex[miss_index])
observed_wgt <- data$wgt[miss_index]

# Build one scatter panel of imputed wgt (y) against the wgt in `data` (x),
# coloured by sex, with a smoother. Uses ggplot() instead of the deprecated
# qplot(); layers, limits, labels and legend position are the same as before.
#   imputed_wgt: imputed wgt values for the rows in miss_index
#   method:      label used for the y axis ("<method> wgt")
# NOTE(review): ylim() drops points outside [-10, 105] before smoothing, which
# is what triggers the "Removed n rows" warnings; coord_cartesian() would zoom
# without dropping them - confirm which is intended.
wgt_panel <- function(imputed_wgt, method) {
  ggplot(
    data.frame(observed = observed_wgt, imputed = imputed_wgt, sex = sex),
    aes(x = observed, y = imputed, colour = sex)
  ) +
    geom_point() +
    stat_smooth() +
    ylim(-10, 105) +
    ylab(paste(method, "wgt")) +
    xlab("data wgt") +
    theme(legend.position = "top")
}

# One row of three panels (mice, ranger, midas) for each of the first 10
# imputed datasets.
for (i in seq_len(10)) {
  g1 <- wgt_panel(impt_mice_data[[i]]$wgt[miss_index], "mice")
  g2 <- wgt_panel(impt_ranger_data[[i]]$wgt[miss_index], "ranger")
  g3 <- wgt_panel(impt_rmidas_data[[i]]$wgt[miss_index], "midas")
  grid.arrange(g1, g2, g3, ncol = 3)
}
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 7 rows containing non-finite values (stat_smooth).
## Warning: Removed 7 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 7 rows containing non-finite values (stat_smooth).
## Warning: Removed 7 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Removed 3 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

compare miss to true data:hgt

# Rows whose hgt is NA in miss_data.
miss_index <- which(is.na(miss_data$hgt))
# Loop-invariant inputs, computed once: sex and the hgt recorded in `data` for
# those rows.
sex <- factor(data$sex[miss_index])
observed_hgt <- data$hgt[miss_index]

# Build one scatter panel of imputed hgt (y) against the hgt in `data` (x),
# coloured by sex, with a smoother. Uses ggplot() instead of the deprecated
# qplot(); layers, limits, labels and legend position are the same as before.
#   imputed_hgt: imputed hgt values for the rows in miss_index
#   method:      label used for the y axis ("<method> hgt")
# NOTE(review): ylim() drops points outside [30, 215] before smoothing, which
# is what triggers the "Removed n rows" warnings; coord_cartesian() would zoom
# without dropping them - confirm which is intended.
hgt_panel <- function(imputed_hgt, method) {
  ggplot(
    data.frame(observed = observed_hgt, imputed = imputed_hgt, sex = sex),
    aes(x = observed, y = imputed, colour = sex)
  ) +
    geom_point() +
    stat_smooth() +
    ylim(30, 215) +
    ylab(paste(method, "hgt")) +
    xlab("data hgt") +
    theme(legend.position = "top")
}

# One row of three panels (mice, ranger, midas) for each of the first 10
# imputed datasets.
for (i in seq_len(10)) {
  g1 <- hgt_panel(impt_mice_data[[i]]$hgt[miss_index], "mice")
  g2 <- hgt_panel(impt_ranger_data[[i]]$hgt[miss_index], "ranger")
  g3 <- hgt_panel(impt_rmidas_data[[i]]$hgt[miss_index], "midas")
  grid.arrange(g1, g2, g3, ncol = 3)
}
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).

## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Removed 2 rows containing missing values (geom_point).

## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).

## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

compare miss to true data:age

# Rows whose age is NA in miss_data.
miss_index <- which(is.na(miss_data$age))
# Loop-invariant inputs, computed once: sex and the age recorded in `data` for
# those rows.
sex <- factor(data$sex[miss_index])
observed_age <- data$age[miss_index]

# Build one scatter panel of imputed age (y) against the age in `data` (x),
# coloured by sex, with a smoother. Uses ggplot() instead of the deprecated
# qplot(); layers, limits, labels and legend position are the same as before.
#   imputed_age: imputed age values for the rows in miss_index
#   method:      label used for the y axis ("<method> age")
# NOTE(review): ylim() drops points outside [-5, 22] before smoothing;
# coord_cartesian() would zoom without dropping them - confirm which is
# intended.
age_panel <- function(imputed_age, method) {
  ggplot(
    data.frame(observed = observed_age, imputed = imputed_age, sex = sex),
    aes(x = observed, y = imputed, colour = sex)
  ) +
    geom_point() +
    stat_smooth() +
    ylim(-5, 22) +
    ylab(paste(method, "age")) +
    xlab("data age") +
    theme(legend.position = "top")
}

# One row of three panels (mice, ranger, midas) for each of the first 10
# imputed datasets.
for (i in seq_len(10)) {
  g1 <- age_panel(impt_mice_data[[i]]$age[miss_index], "mice")
  g2 <- age_panel(impt_ranger_data[[i]]$age[miss_index], "ranger")
  g3 <- age_panel(impt_rmidas_data[[i]]$age[miss_index], "midas")
  grid.arrange(g1, g2, g3, ncol = 3)
}